0.1 Import packages

library(pROC)
library(ggpubr)
library(Biostrings)
library(data.table)
library(dplyr)

library(caret)
library(purrr)

library(tidyverse)
library(yardstick)
library(doParallel)
library(foreach)
library(stringdist)
library(magrittr)

1 Functions

# General F-beta score: the weighted harmonic mean of precision and recall.
# beta < 1 weights precision more heavily, beta > 1 weights recall more
# heavily, and beta = 1 recovers the standard F1 score.
calculate_f_beta <- function(beta, precision, recall) {
  beta_sq <- beta^2
  (beta_sq + 1) * (precision * recall / (beta_sq * precision + recall))
}

2 Import datasets

# Load the combined per-model prediction table (one row per peptide per model)
# and the underlying full peptide dataset.
combinedData <- as.data.table(readRDS("PanHLA_combinedData.rds"))
combinedData %>% DT::datatable(caption="Model Results")
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
FullDataset <- readRDS("PanHLA_FullDataset.rds")

# Optional flat-file exports of the two datasets (kept for reference):
# write.table(combinedData,"PanHLA_combinedData.txt",sep="\t",row.names = F,quote=F)
# write.table(FullDataset,"PanHLA_FullDataset.txt",sep="\t",row.names = F,quote=F)

3 Analysis

3.1 Produce ROC-AUC

# Per-model accuracy: correct predictions divided by the full dataset size.
# NOTE(review): this denominator assumes every model scored each FullDataset
# row exactly once (group size == nrow(FullDataset)) — confirm upstream.
accuracyDF <- combinedData %>%
  dplyr::group_by(Dataset) %>%
  summarise(
    Accuracy = sum(ImmunogenicityPrediction == Immunogenicity) / nrow(FullDataset)
  )
# Per-model ROC-AUC from pROC (response ~ continuous immunogenicity score).
AUCDF <- combinedData %>%
  group_by(Dataset) %>%
  dplyr::summarise(ROC = as.numeric(roc(Immunogenicity ~ ImmunogenicityScore)$auc))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
# Fit one pROC ROC curve per model (response ~ continuous score). The fitted
# objects are kept individually so they can be overlaid in a single ggroc
# panel and their AUCs annotated on it.
NETTEPIAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'NETTEPI'))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
IPREDAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'IPRED'))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
IEDBMODELAUC=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'IEDB'))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
REPITOPE_AUC_CV=roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'REPITOPE'))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
NetTepi_NETMHC.AUC.CV = roc(Immunogenicity ~ ImmunogenicityScore,data=combinedData %>% filter(Dataset %in% 'NETTEPI_netMHCpan4'))
## Setting levels: control = Positive, case = Negative
## Setting direction: controls > cases
# Overlay all five ROC curves (legacy.axes = TRUE plots 1 - specificity on the
# x axis), annotate the panel with each model's AUC rounded to 3 d.p., and add
# a dashed grey diagonal marking random-classifier performance.
roc.AUC=ggroc(list(IEDB_Model=IEDBMODELAUC,iPred=IPREDAUC,NetTepi=NETTEPIAUC,NetTepi_NetMHCpan4=NetTepi_NETMHC.AUC.CV,REpitope=REPITOPE_AUC_CV),legacy.axes = TRUE,size=1.25) + theme_bw() +
  annotate("size"=5,"text",x=.55,y=.185,label=paste0("IEDB:     ",round(auc(IEDBMODELAUC),digits=3),"\n","iPred:     ",round(auc(IPREDAUC),digits=3),"\n","NetTepi: ",round(auc(NETTEPIAUC),digits=3),"\n","NetTepi_NetMHCpan4: ",round(auc(NetTepi_NETMHC.AUC.CV),digits=3),"\n","Repitope: ",round(auc(REPITOPE_AUC_CV),digits=3))) + font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.title",color="white") + font("legend.text",size=12) + geom_abline(size=1,intercept = 0, slope = 1,color = "darkgrey", linetype = "dashed")

3.2 Produce PR-AUC

# What is each models PR-AUC?
# yardstick's PR functions treat the first factor level as the event class by
# default, so relevel with "Positive" first before computing the metrics.
combinedData$Immunogenicity = factor(combinedData$Immunogenicity,levels = c("Positive","Negative"))
PR_AUC_COMBINED=combinedData %>% group_by(Dataset) %>% pr_auc(Immunogenicity,ImmunogenicityScore)
# Round the per-model PR-AUC estimates for the on-plot annotation below.
PR_AUC_COMBINED$.estimate=round(PR_AUC_COMBINED$.estimate,digits=3)

# Per-model precision-recall curves; aes(size = Dataset) plus the manual size
# scale thickens all five curves equally. The dashed horizontal baseline is
# the positive-class prevalence in the full dataset (the no-skill precision).
pr.AUC=combinedData %>% group_by(Dataset) %>% pr_curve(Immunogenicity,ImmunogenicityScore) %>% autoplot() + aes(size = Dataset)+scale_size_manual(values=c(1.25,1.25,1.25,1.25,1.25)) + ggtitle("Precision-Recall Curves") +annotate("size"=5,"text",x=.5,y=.155,label=paste0("IEDB: ",PR_AUC_COMBINED$.estimate[1],"\n","iPred: ",PR_AUC_COMBINED$.estimate[2],"\n","NetTepi: ",PR_AUC_COMBINED$.estimate[3],"\n","NetTepi_NetMHCpan4: ",PR_AUC_COMBINED$.estimate[4],"\n","Repitope: ",PR_AUC_COMBINED$.estimate[5])) + geom_hline(size=1,color="darkgrey",yintercept = nrow(FullDataset[FullDataset$Immunogenicity=='Positive',]) / nrow(FullDataset),linetype="dashed")+ font("xy.text",size=16,color="black")+ font("xlab",size=16,color="black")+ font("ylab",size=16,color="black") + font("legend.title",color="white") + font("legend.text",size=10)

3.3 Plot PR-AUC

  • The grob widths and heights are equalised below so that the PR and ROC plots render at exactly the same size.
# Convert both ggplots to grobs and take the elementwise maximum of their
# layout widths/heights so the PR and ROC figures render at the same size.
gl <- lapply(list(pr.AUC, roc.AUC), ggplotGrob)
library(grid)
widths <- do.call(unit.pmax, lapply(gl, function(g) g$widths))
heights <- do.call(unit.pmax, lapply(gl, function(g) g$heights))
lg <- lapply(gl, function(g) {
  g$widths <- widths
  g$heights <- heights
  g
})
# Draw the size-matched PR-curve grob first.
grid.newpage()
grid.draw(lg[[1]])

3.4 Plot ROC-AUC

  • Main text Figure 2A
# Draw the size-matched ROC grob (second element of lg) — Main text Figure 2A.
grid.newpage()
grid.draw(lg[[2]])

3.5 Produce F1 Score

# caret::confusionMatrix expects factors; fix the level order once so that
# "Positive" is handled consistently in all metric computations below.
combinedData$Immunogenicity <- factor(combinedData$Immunogenicity, levels = c("Positive", "Negative"))
combinedData$ImmunogenicityPrediction <- factor(combinedData$ImmunogenicityPrediction, levels = c("Positive", "Negative"))

# Per-model F1 score (harmonic mean of precision and recall).
F1SCORE <- combinedData %>%
  group_by(Dataset) %>%
  dplyr::summarise(
    F1_SCORE = confusionMatrix(
      ImmunogenicityPrediction,
      reference = Immunogenicity,
      positive = "Positive",
      mode = "prec_recall"
    )$byClass["F1"]
  )

3.6 Produce Precision

# Per-model precision (positive predictive value) from caret's confusion matrix.
PRECISIONCV <- combinedData %>%
  group_by(Dataset) %>%
  dplyr::summarise(
    Precision = confusionMatrix(
      ImmunogenicityPrediction,
      reference = Immunogenicity,
      positive = "Positive",
      mode = "prec_recall"
    )$byClass["Precision"]
  )

3.7 Produce Recall

# Per-model recall (sensitivity); converted to data.table because downstream
# code joins on this table.
RECALLCV <- combinedData %>%
  group_by(Dataset) %>%
  dplyr::summarise(
    Recall = confusionMatrix(
      ImmunogenicityPrediction,
      reference = Immunogenicity,
      positive = "Positive",
      mode = "prec_recall"
    )$byClass["Recall"]
  ) %>%
  as.data.table()

3.8 Produce Balanced accuracy

# Per-model balanced accuracy (mean of sensitivity and specificity).
BALACCURACYCV <- combinedData %>%
  group_by(Dataset) %>%
  dplyr::summarise(
    BalancedAccuracy = confusionMatrix(
      ImmunogenicityPrediction,
      reference = Immunogenicity,
      positive = "Positive",
      mode = "prec_recall"
    )$byClass["Balanced Accuracy"]
  )

3.9 Combine all metrics

# F0.5 score (beta = 0.5): weights precision more heavily than recall.
# The join key is given explicitly rather than relying on dplyr's guessed
# natural join (which emitted a 'Joining, by = "Dataset"' message).
F0.5SCORE <- inner_join(RECALLCV, PRECISIONCV, by = "Dataset") %>%
  group_by(Dataset) %>%
  dplyr::summarise(F_0.5 = calculate_f_beta(0.5, Precision, Recall))

3.10 Visualisation of all key metrics.

# Combine all per-model metrics into one table, reshape to long format, and
# draw one facet per metric. tidyr::pivot_longer replaces the deprecated
# reshape2::melt redirection (which data.table warns will become an error),
# and explicit join keys silence the 'Joining, by = ...' messages.
accuracyDF %>%
  inner_join(F1SCORE, by = "Dataset") %>%
  inner_join(PRECISIONCV, by = "Dataset") %>%
  inner_join(RECALLCV, by = "Dataset") %>%
  inner_join(F0.5SCORE, by = "Dataset") %>%
  inner_join(BALACCURACYCV, by = "Dataset") %>%
  pivot_longer(-Dataset, names_to = "Metric", values_to = "value") %>%
  # melt produced a factor in column order; pivot_longer yields character, so
  # restore the factor to keep the original facet ordering.
  mutate(Metric = factor(Metric, levels = unique(Metric))) %>%
  ggbarplot(
    x = "Dataset", y = "value", fill = "Metric",
    label = TRUE, lab.nb.digits = 2, alpha = 0.6
  ) +
  facet_wrap(~Metric) + rotate_x_text(angle = 90) + ylim(0, 1)

3.11 IEDB Model Confusion Matrix

# Confusion matrix for the IEDB model, with "Positive" as the event class and
# precision/recall-style summary statistics.
CM <- combinedData %>%
  filter(Dataset %in% 'IEDB') %>%
  with(confusionMatrix(
    factor(ImmunogenicityPrediction, levels = c("Negative", "Positive")),
    reference = factor(Immunogenicity, levels = c("Negative", "Positive")),
    positive = "Positive",
    mode = "prec_recall"
  ))
CM
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negative Positive
##   Negative     2670      882
##   Positive     2172     1797
##                                           
##                Accuracy : 0.5939          
##                  95% CI : (0.5827, 0.6051)
##     No Information Rate : 0.6438          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2006          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##               Precision : 0.4528          
##                  Recall : 0.6708          
##                      F1 : 0.5406          
##              Prevalence : 0.3562          
##          Detection Rate : 0.2389          
##    Detection Prevalence : 0.5277          
##       Balanced Accuracy : 0.6111          
##                                           
##        'Positive' Class : Positive        
## 
# Flatten the confusion-matrix counts and tag each cell as an accurate or
# inaccurate call; prop scales cell shading within each reference class.
table <- data.frame(CM$table)

plotTable <- table %>%
  mutate(Performance = ifelse(Prediction == Reference, "Accurate", "Inaccurate")) %>%
  group_by(Reference) %>%
  mutate(prop = Freq / sum(Freq))

# Green/red tile map of the confusion matrix; alpha encodes the within-class
# proportion and the x axis is reversed so "Positive" appears first.
ggplot(plotTable, aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1, size = 8) +
  scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
  theme_bw() +
  xlim(rev(levels(table$Reference))) +
  ggtitle("IEDB Immunogenicity Model") +
  font("xy.text", size = 16, color = "black") +
  font("xlab", size = 16, color = "black") +
  font("ylab", size = 16, color = "black") +
  font("legend.text", size = 10)
IEDB Confusion Matrix. Color is relative to sensitivity/specificity by proportional outcomes within reference groups

Figure 3.1: IEDB Confusion Matrix. Color is relative to sensitivity/specificity by proportional outcomes within reference groups

3.12 NetTepi Confusion Matrix

# Confusion matrix for the NetTepi model, with "Positive" as the event class
# and precision/recall-style summary statistics.
CM <- combinedData %>%
  filter(Dataset %in% 'NETTEPI') %>%
  with(confusionMatrix(
    factor(ImmunogenicityPrediction, levels = c("Negative", "Positive")),
    reference = factor(Immunogenicity, levels = c("Negative", "Positive")),
    positive = "Positive",
    mode = "prec_recall"
  ))
CM
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negative Positive
##   Negative     3173     1053
##   Positive     1669     1626
##                                          
##                Accuracy : 0.6381         
##                  95% CI : (0.6271, 0.649)
##     No Information Rate : 0.6438         
##     P-Value [Acc > NIR] : 0.8525         
##                                          
##                   Kappa : 0.2494         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##               Precision : 0.4935         
##                  Recall : 0.6069         
##                      F1 : 0.5444         
##              Prevalence : 0.3562         
##          Detection Rate : 0.2162         
##    Detection Prevalence : 0.4381         
##       Balanced Accuracy : 0.6311         
##                                          
##        'Positive' Class : Positive       
## 
# Flatten the confusion-matrix counts and tag each cell as an accurate or
# inaccurate call; prop scales cell shading within each reference class.
table <- data.frame(CM$table)

plotTable <- table %>%
  mutate(Performance = ifelse(Prediction == Reference, "Accurate", "Inaccurate")) %>%
  group_by(Reference) %>%
  mutate(prop = Freq / sum(Freq))

# Green/red tile map of the confusion matrix; alpha encodes the within-class
# proportion and the x axis is reversed so "Positive" appears first.
ggplot(plotTable, aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1, size = 8) +
  scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
  theme_bw() +
  xlim(rev(levels(table$Reference))) +
  ggtitle("NetTepi") +
  font("xy.text", size = 16, color = "black") +
  font("xlab", size = 16, color = "black") +
  font("ylab", size = 16, color = "black") +
  font("legend.text", size = 10)
NetTepi Confusion Matrix. Color is relative to sensitivity/specificity by proportional outcomes within reference groups

Figure 3.2: NetTepi Confusion Matrix. Color is relative to sensitivity/specificity by proportional outcomes within reference groups

3.13 NETTEPI_netMHCpan4 Confusion Matrix

# Confusion matrix for the NetTepi + netMHCpan4 model, with "Positive" as the
# event class and precision/recall-style summary statistics.
CM <- combinedData %>%
  filter(Dataset == 'NETTEPI_netMHCpan4') %>%
  with(confusionMatrix(
    factor(ImmunogenicityPrediction, levels = c("Negative", "Positive")),
    reference = factor(Immunogenicity, levels = c("Negative", "Positive")),
    positive = "Positive",
    mode = "prec_recall"
  ))
CM
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negative Positive
##   Negative     3085      996
##   Positive     1757     1683
##                                          
##                Accuracy : 0.634          
##                  95% CI : (0.623, 0.6449)
##     No Information Rate : 0.6438         
##     P-Value [Acc > NIR] : 0.9634         
##                                          
##                   Kappa : 0.2495         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##               Precision : 0.4892         
##                  Recall : 0.6282         
##                      F1 : 0.5501         
##              Prevalence : 0.3562         
##          Detection Rate : 0.2238         
##    Detection Prevalence : 0.4574         
##       Balanced Accuracy : 0.6327         
##                                          
##        'Positive' Class : Positive       
## 
# Flatten the confusion-matrix counts and tag each cell as an accurate or
# inaccurate call; prop scales cell shading within each reference class.
table <- data.frame(CM$table)

plotTable <- table %>%
  mutate(Performance = ifelse(Prediction == Reference, "Accurate", "Inaccurate")) %>%
  group_by(Reference) %>%
  mutate(prop = Freq / sum(Freq))

# Green/red tile map of the confusion matrix; alpha encodes the within-class
# proportion and the x axis is reversed so "Positive" appears first.
ggplot(plotTable, aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1, size = 8) +
  scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
  theme_bw() +
  xlim(rev(levels(table$Reference))) +
  ggtitle("NETTEPI_netMHCpan4") +
  font("xy.text", size = 16, color = "black") +
  font("xlab", size = 16, color = "black") +
  font("ylab", size = 16, color = "black") +
  font("legend.text", size = 10)
NETTEPI_netMHCpan4 Confusion Matrix. Color is relative to sensitivity/specificity by proportional outcomes within reference groups

Figure 3.3: NETTEPI_netMHCpan4 Confusion Matrix. Color is relative to sensitivity/specificity by proportional outcomes within reference groups

3.14 iPred Confusion Matrix

# Confusion matrix for the iPred model, with "Positive" as the event class and
# precision/recall-style summary statistics.
CM <- combinedData %>%
  filter(Dataset == 'IPRED') %>%
  with(confusionMatrix(
    factor(ImmunogenicityPrediction, levels = c("Negative", "Positive")),
    reference = factor(Immunogenicity, levels = c("Negative", "Positive")),
    positive = "Positive",
    mode = "prec_recall"
  ))
CM
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negative Positive
##   Negative     2885      731
##   Positive     1957     1948
##                                           
##                Accuracy : 0.6426          
##                  95% CI : (0.6316, 0.6534)
##     No Information Rate : 0.6438          
##     P-Value [Acc > NIR] : 0.5909          
##                                           
##                   Kappa : 0.293           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##               Precision : 0.4988          
##                  Recall : 0.7271          
##                      F1 : 0.5917          
##              Prevalence : 0.3562          
##          Detection Rate : 0.2590          
##    Detection Prevalence : 0.5192          
##       Balanced Accuracy : 0.6615          
##                                           
##        'Positive' Class : Positive        
## 
# Flatten the confusion-matrix counts and tag each cell as an accurate or
# inaccurate call; prop scales cell shading within each reference class.
table <- data.frame(CM$table)

plotTable <- table %>%
  mutate(Performance = ifelse(Prediction == Reference, "Accurate", "Inaccurate")) %>%
  group_by(Reference) %>%
  mutate(prop = Freq / sum(Freq))

# Green/red tile map of the confusion matrix; alpha encodes the within-class
# proportion and the x axis is reversed so "Positive" appears first.
ggplot(plotTable, aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1, size = 8) +
  scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
  theme_bw() +
  xlim(rev(levels(table$Reference))) +
  ggtitle("iPred") +
  font("xy.text", size = 16, color = "black") +
  font("xlab", size = 16, color = "black") +
  font("ylab", size = 16, color = "black") +
  font("legend.text", size = 10)
iPred Confusion Matrix. Color is relative to sensitivity/specificity by proportional outcomes within reference groups

Figure 3.4: iPred Confusion Matrix. Color is relative to sensitivity/specificity by proportional outcomes within reference groups

3.15 REPITOPE Confusion Matrix

# Confusion matrix for the REpitope model, with "Positive" as the event class
# and precision/recall-style summary statistics.
CM <- combinedData %>%
  filter(Dataset == 'REPITOPE') %>%
  with(confusionMatrix(
    factor(ImmunogenicityPrediction, levels = c("Negative", "Positive")),
    reference = factor(Immunogenicity, levels = c("Negative", "Positive")),
    positive = "Positive",
    mode = "prec_recall"
  ))
CM
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negative Positive
##   Negative     3615      977
##   Positive     1227     1702
##                                           
##                Accuracy : 0.707           
##                  95% CI : (0.6965, 0.7172)
##     No Information Rate : 0.6438          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3741          
##                                           
##  Mcnemar's Test P-Value : 1.134e-07       
##                                           
##               Precision : 0.5811          
##                  Recall : 0.6353          
##                      F1 : 0.6070          
##              Prevalence : 0.3562          
##          Detection Rate : 0.2263          
##    Detection Prevalence : 0.3894          
##       Balanced Accuracy : 0.6910          
##                                           
##        'Positive' Class : Positive        
## 
# Flatten the confusion-matrix counts and tag each cell as an accurate or
# inaccurate call; prop scales cell shading within each reference class.
table <- data.frame(CM$table)

plotTable <- table %>%
  mutate(Performance = ifelse(Prediction == Reference, "Accurate", "Inaccurate")) %>%
  group_by(Reference) %>%
  mutate(prop = Freq / sum(Freq))

# Green/red tile map of the confusion matrix; alpha encodes the within-class
# proportion and the x axis is reversed so "Positive" appears first.
ggplot(plotTable, aes(x = Reference, y = Prediction, fill = Performance, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), vjust = .5, fontface = "bold", alpha = 1, size = 8) +
  scale_fill_manual(values = c(Accurate = "green", Inaccurate = "red")) +
  theme_bw() +
  xlim(rev(levels(table$Reference))) +
  ggtitle("REpitope") +
  font("xy.text", size = 16, color = "black") +
  font("xlab", size = 16, color = "black") +
  font("ylab", size = 16, color = "black") +
  font("legend.text", size = 10)
REpitope Confusion Matrix. Color is relative to sensitivity/specificity by proportional outcomes within reference groups

Figure 3.5: REpitope Confusion Matrix. Color is relative to sensitivity/specificity by proportional outcomes within reference groups